package com.liusy.serachengine.parserFile;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

import org.apache.log4j.Logger;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Extracts the plain text of a PDF file using PDFBox.
 */
public class ParserPdf {
    private static final Logger log = Logger.getLogger(ParserPdf.class);

    /**
     * Reads the PDF at {@code path} and returns its text content.
     *
     * <p>Failures are not propagated: any exception raised while opening,
     * parsing or extracting is logged, a message is printed to standard
     * output, and the buffer is returned as-is (empty, since text is only
     * appended after a successful extraction).
     *
     * @param path file system path of the PDF to read
     * @return the extracted text; an empty buffer when the file does not
     *         exist or the extraction fails. Never {@code null}.
     */
    public static StringBuffer parserPDF(String path) {
        StringBuffer content = new StringBuffer(""); // document content
        FileInputStream fis = null;
        PDDocument pdd = null;
        try {
            File f = new File(path);
            if (!f.exists()) {
                return content;
            }
            fis = new FileInputStream(path);
            PDFParser p = new PDFParser(fis);
            PDFTextStripper ts = new PDFTextStripper();
            p.parse();
            pdd = p.getPDDocument();
            content.append(ts.getText(pdd));
        }
        catch (IOException ioe) {
            log.error("PDF IO error:", ioe);
            printFailure(path);
        }
        catch (Exception e) {
            log.error("PDF Parser error:", e);
            printFailure(path);
        }
        finally {
            // Release resources on every path (including parse/extract
            // failures). Each close gets its own try so that a failure
            // closing the document does not skip closing the stream.
            if (pdd != null) {
                try {
                    pdd.close();
                }
                catch (IOException closeError) {
                    log.error("PDF IO error:", closeError);
                }
            }
            if (fis != null) {
                try {
                    fis.close();
                }
                catch (IOException closeError) {
                    log.error("PDF IO error:", closeError);
                }
            }
        }
        return content;
    }

    /**
     * Prints the extraction-failure message for {@code path} to standard output.
     */
    private static void printFailure(String path) {
        // NOTE(review): the literal below looks mis-encoded in the source file
        // (presumably a non-ASCII message saved in a different charset); it is
        // kept byte-for-byte to avoid changing runtime output - confirm and
        // restore the intended text.
        System.out.println("��ȡ�ļ�" + path + "���ı����ݳ��?����������δ���ɹ���");
    }

}
